/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/in.h>

#include "rds.h"
#include "ib.h"

/*
 * The cm_id used to listen for incoming connection requests.  It is set by
 * rds_ib_listen_init() once the listen has succeeded, and it is destroyed
 * and reset to NULL by rds_ib_listen_stop().
 */
static struct rdma_cm_id *rds_ib_listen_id;

/*
 * Clear *conn_param and then fill in the resource and retry settings that
 * are shared by the connecting and accepting paths.  Callers add private
 * data afterwards if they need it.
 */
static void rds_ib_cm_fill_conn_param(struct rdma_conn_param *conn_param)
{
	memset(conn_param, 0, sizeof(*conn_param));

	/* XXX tune these? */
	conn_param->retry_count = 7;
	conn_param->rnr_retry_count = 7;
	conn_param->initiator_depth = 1;
	conn_param->responder_resources = 1;
}

/*
 * Asynchronous event callback handed to ib_create_cq() for both the send
 * and the receive completion queue.  It only logs the event.
 */
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
	void *context = data;

	rdsdebug("event %u data %p\n", event->event, context);
}

/*
 * Asynchronous event callback for the queue pair.  @data is the
 * rds_connection that was stored in qp_context when the QP was created.
 *
 * IB_EVENT_COMM_EST completes the connection; every other event is only
 * reported with a warning.
 */
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rds_connection *conn = data;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);

	if (event->event == IB_EVENT_COMM_EST) {
		rds_connect_complete(conn);
		return;
	}

	printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
	       "on connection to %u.%u.%u.%u\n", event->event,
	       NIPQUAD(conn->c_faddr));
}

/*
 * This needs to be very careful to not leave IS_ERR pointers around for
 * cleanup to trip over.
 */
static int rds_ib_setup_qp(struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_qp_init_attr attr;
	int ret;

	ic->i_pd = ib_alloc_pd(ic->i_cm_id->device);
	if (IS_ERR(ic->i_pd)) {
		ret = PTR_ERR(ic->i_pd);
		ic->i_pd = NULL;
		rdsdebug("ib_alloc_pd failed: %d\n", ret);
		goto out;
	}

	ic->i_mr = ib_get_dma_mr(ic->i_pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(ic->i_mr)) {
		ret = PTR_ERR(ic->i_mr);
		ic->i_mr = NULL;
		rdsdebug("ib_get_dma_mr failed: %d\n", ret);
		goto out;
	}

	ic->i_send_cq = ib_create_cq(ic->i_cm_id->device,
				     rds_ib_send_cq_comp_handler,
				     rds_ib_cq_event_handler, conn,
				     ic->i_send_ring.w_nr);
	if (IS_ERR(ic->i_send_cq)) {
		ret = PTR_ERR(ic->i_send_cq);
		ic->i_send_cq = NULL;
		rdsdebug("ib_create_cq send failed: %d\n", ret);
		goto out;
	}

	ic->i_recv_cq = ib_create_cq(ic->i_cm_id->device,
				     rds_ib_recv_cq_comp_handler,
				     rds_ib_cq_event_handler, conn,
				     ic->i_recv_ring.w_nr);
	if (IS_ERR(ic->i_recv_cq)) {
		ret = PTR_ERR(ic->i_recv_cq);
		ic->i_recv_cq = NULL;
		rdsdebug("ib_create_cq recv failed: %d\n", ret);
		goto out;
	}

	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
	if (ret) {
		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
		goto out;
	}

	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
	if (ret) {
		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
		goto out;
	}

	/* XXX negotiate max send/recv with remote? */
	memset(&attr, 0, sizeof(attr));
	attr.event_handler = rds_ib_qp_event_handler;
	attr.qp_context = conn;
	/* + 1 to allow for the single ack message */
	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
	attr.cap.max_send_sge = 1;
	attr.cap.max_recv_sge = 1;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.qp_type = IB_QPT_RC;
	attr.send_cq = ic->i_send_cq;
	attr.recv_cq = ic->i_recv_cq;

	/* 
	 * XXX this can fail if max_*_wr is too large?  Are we supposed
	 * to back off until we get a value that the hardware can support?
	 */
	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
	if (ret) {
		rdsdebug("ib_req_notify_cq failed: %d\n", ret);
		goto out;
	}

	ic->i_send_hdrs = dma_alloc_coherent(ic->i_cm_id->device->dma_device,
					   ic->i_send_ring.w_nr *
					   	sizeof(struct rds_header),
					   &ic->i_send_hdrs_dma, GFP_KERNEL);
	if (ic->i_send_hdrs == NULL) {
		ret = -ENOMEM;
		rdsdebug("dma_alloc_coherent send failed\n");
		goto out;
	}

	ic->i_recv_hdrs = dma_alloc_coherent(ic->i_cm_id->device->dma_device,
					   ic->i_recv_ring.w_nr *
					   	sizeof(struct rds_header),
					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
	if (ic->i_recv_hdrs == NULL) {
		ret = -ENOMEM;
		rdsdebug("dma_alloc_coherent recv failed\n");
		goto out;
	}

	ic->i_ack = dma_alloc_coherent(ic->i_cm_id->device->dma_device,
				       sizeof(struct rds_ib_ack),
				       &ic->i_ack_dma, GFP_KERNEL);
	if (ic->i_ack == NULL) {
		ret = -ENOMEM;
		rdsdebug("dma_alloc_coherent ack failed\n");
		goto out;
	}

	ic->i_sends = kmalloc(ic->i_send_ring.w_nr *
				sizeof(struct rds_ib_send_work), GFP_KERNEL);
	if (ic->i_sends == NULL) {
		ret = -ENOMEM;
		rdsdebug("send allocation failed\n");
		goto out;
	}

	ic->i_recvs = kmalloc(ic->i_recv_ring.w_nr *
				sizeof(struct rds_ib_recv_work), GFP_KERNEL);
	if (ic->i_recvs == NULL) {
		ret = -ENOMEM;
		rdsdebug("recv allocation failed\n");
		goto out;
	}

	rds_ib_send_init_ring(ic);
	rds_ib_recv_init_ring(ic);
	rds_ib_recv_init_ack(ic);

	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
		 ic->i_send_cq, ic->i_recv_cq);

out:
	return ret;
}

static int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
				    struct rdma_cm_event *event)
{
	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
	struct rds_connection *conn;
	struct rds_ib_connection *ic;
	struct rdma_conn_param conn_param;
	int ret;

	rdsdebug("saddr %u.%u.%u.%u daddr %u.%u.%u.%u lguid 0x%llx fguid "
		 "0x%llx\n", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
		 (unsigned long long)be64_to_cpu(lguid),
		 (unsigned long long)be64_to_cpu(fguid));

	conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
			       GFP_KERNEL);
	if (IS_ERR(conn)) {
		ret = PTR_ERR(conn);
		goto out;
	}
	ic = conn->c_transport_data;

	/*
	 * see the comment above rds_queue_delayed_reconnect()
	 */
	if (test_and_set_bit(RDS_CONN_CONNECTING, &conn->c_status)) {
		if (test_bit(RDS_CONN_CONNECTED, &conn->c_status))
			rds_ib_stats_inc(s_ib_listen_closed_stale);
		else
			rds_ib_stats_inc(s_ib_connect_raced);
		queue_work(rds_wq, &conn->c_down_w);
		ret = 0;
		goto out;
	}

	BUG_ON(ic->i_cm_id);
	BUG_ON(cm_id->context);

	ic->i_cm_id = cm_id;
	cm_id->context = conn;

	ret = rds_ib_setup_qp(conn);
	if (ret)
		goto out;

	rds_ib_cm_fill_conn_param(&conn_param);

	/* rdma_accept() calls rdma_reject() internally if it fails */
	ret = rdma_accept(cm_id, &conn_param);
	cm_id = NULL;
out:
	if (ret && cm_id)
		rdma_reject(cm_id, NULL, 0);
	return ret;
}

/*
 * Active side of connection setup, run once the route to the peer has been
 * resolved.  Builds the QP resources for the connection stored in
 * cm_id->context and issues rdma_connect(), passing our local and foreign
 * addresses to the peer as private data.
 *
 * Returns 0 on success or the error from rds_ib_setup_qp() or
 * rdma_connect().
 */
static int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
	struct rds_connection *conn = cm_id->context;
	struct rdma_conn_param conn_param;
	struct rds_ib_connect_private dp;
	int ret;

	ret = rds_ib_setup_qp(conn);
	if (ret)
		goto out;

	/*
	 * dp lives on the stack and is sent to the peer in full.  Zero it
	 * first so that any padding or member not assigned below never
	 * carries stale stack contents onto the wire.
	 */
	memset(&dp, 0, sizeof(dp));
	dp.dp_saddr = conn->c_laddr;
	dp.dp_daddr = conn->c_faddr;

	rds_ib_cm_fill_conn_param(&conn_param);
	conn_param.private_data = &dp;
	conn_param.private_data_len = sizeof(dp);

	ret = rdma_connect(cm_id, &conn_param);
	if (ret)
		rdsdebug("rdma_connect failed: %d\n", ret);

out:
	return ret;
}

/*
 * Event callback registered with rdma_create_id() for both the listening
 * cm_id and the per-connection cm_ids.  It drives connection setup from the
 * RDMA CM events:
 *
 *  - CONNECT_REQUEST: passive side, see rds_ib_cm_handle_connect()
 *  - ADDR_RESOLVED:   active side, kick off route resolution
 *  - ROUTE_RESOLVED:  active side, see rds_ib_cm_initiate_connect()
 *  - ESTABLISHED:     the connection is up
 *  - errors and DISCONNECTED: queue the connection's shutdown work
 *
 * Any other event is treated as a bug.  The return value is the result of
 * the step that was taken, or 0 for events that take no step that can fail.
 */
static int rds_ib_cm_event_handler(struct rdma_cm_id *cm_id,
				   struct rdma_cm_event *event)
{
	/* this can be null in the listening path */
	struct rds_connection *conn = cm_id->context;
	int ret = 0;

	rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
		 event->event);

	switch (event->event) {
		case RDMA_CM_EVENT_CONNECT_REQUEST:
			ret = rds_ib_cm_handle_connect(cm_id, event);
			break;

		case RDMA_CM_EVENT_ADDR_RESOLVED:
			/* XXX do we need to clean up if this fails? */
			ret = rdma_resolve_route(cm_id,
						 RDS_IB_RESOLVE_TIMEOUT_MS);
			break;

		case RDMA_CM_EVENT_ROUTE_RESOLVED:
			/* XXX worry about racing with listen acceptance */
			ret = rds_ib_cm_initiate_connect(cm_id);
			break;

		case RDMA_CM_EVENT_ESTABLISHED:
			rds_connect_complete(conn);
			break;

		case RDMA_CM_EVENT_ADDR_ERROR:
		case RDMA_CM_EVENT_ROUTE_ERROR:
		case RDMA_CM_EVENT_CONNECT_ERROR:
		case RDMA_CM_EVENT_UNREACHABLE:
		case RDMA_CM_EVENT_REJECTED:
		case RDMA_CM_EVENT_DEVICE_REMOVAL:
		case RDMA_CM_EVENT_DISCONNECTED:
			/*
			 * A cm_id that never had a connection attached has
			 * nothing to shut down, so don't dereference a NULL
			 * conn for any of these events.
			 */
			if (conn)
				queue_work(rds_wq, &conn->c_down_w);
			break;

		default:
			/* things like device disconnect? */
			printk(KERN_ERR "unknown event %u\n", event->event);
			BUG();
			break;
	}

	rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
	return ret;
}

/*
 * Start an outgoing connection for @conn.  A new cm_id is created and
 * stored in the transport data, then address resolution is started from
 * conn->c_laddr (any port) to conn->c_faddr on RDS_PORT.  The remaining
 * steps are taken by rds_ib_cm_event_handler() as events arrive.
 *
 * Returns 0 once resolution has been started.  On failure the error is
 * returned and ic->i_cm_id is left NULL.
 */
int rds_ib_conn_connect(struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct sockaddr_in src, dest;
	int ret;

	/* XXX I wonder what affect the port space has */
	ic->i_cm_id = rdma_create_id(rds_ib_cm_event_handler, conn,
				     RDMA_PS_TCP);
	if (IS_ERR(ic->i_cm_id)) {
		ret = PTR_ERR(ic->i_cm_id);
		/* don't leave an IS_ERR pointer behind for shutdown */
		ic->i_cm_id = NULL;
		rdsdebug("rdma_create_id() failed: %d\n", ret);
		return ret;
	}

	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);

	/* local end: our address, no particular port */
	src.sin_family = AF_INET;
	src.sin_port = (__force u16)htons(0);
	src.sin_addr.s_addr = (__force u32)conn->c_laddr;

	/* remote end: the peer's address on the RDS port */
	dest.sin_family = AF_INET;
	dest.sin_port = (__force u16)htons(RDS_PORT);
	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;

	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
				(struct sockaddr *)&dest,
				RDS_IB_RESOLVE_TIMEOUT_MS);
	if (!ret)
		return 0;

	rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
		 ret);
	rdma_destroy_id(ic->i_cm_id);
	ic->i_cm_id = NULL;

	return ret;
}

/*
 * Tear down whatever rds_ib_conn_connect(), rds_ib_cm_handle_connect() and
 * rds_ib_setup_qp() managed to build for @conn.
 *
 * This is so careful about only cleaning up resources that were built up
 * so that it can be called at any point during startup.  In fact it
 * can be called multiple times for a given connection: every member is
 * checked before it is released and reset to NULL afterwards.
 *
 * i_wc_err is held at 1 for the duration of the teardown and cleared again
 * at the end.
 */
void rds_ib_conn_shutdown(struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);

	ic->i_wc_err = 1;

	if (ic->i_cm_id) {
		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
		rdma_disconnect(ic->i_cm_id);
		/* XXX can this ever hang indefinitely? */
		wait_event(rds_ib_ring_empty_wait,
			   rds_ib_ring_empty(&ic->i_send_ring) &&
			   rds_ib_ring_empty(&ic->i_recv_ring));

		/* both rings are empty now; release the DMA buffers */
		if (ic->i_send_hdrs)
			dma_free_coherent(ic->i_cm_id->device->dma_device,
					   ic->i_send_ring.w_nr *
						sizeof(struct rds_header),
					   ic->i_send_hdrs,
					   ic->i_send_hdrs_dma);

		if (ic->i_recv_hdrs)
			dma_free_coherent(ic->i_cm_id->device->dma_device,
					   ic->i_recv_ring.w_nr *
						sizeof(struct rds_header),
					   ic->i_recv_hdrs,
					   ic->i_recv_hdrs_dma);

		if (ic->i_ack)
			dma_free_coherent(ic->i_cm_id->device->dma_device,
					  sizeof(struct rds_ib_ack),
					  ic->i_ack, ic->i_ack_dma);

		/*
		 * NOTE(review): this unmaps the whole i_map_sg array rather
		 * than i_map_count entries; confirm against the code that
		 * maps it.
		 */
		if (ic->i_map_count)
			dma_unmap_sg(ic->i_cm_id->device->dma_device,
				     ic->i_map_sg, ARRAY_SIZE(ic->i_map_sg),
				     DMA_TO_DEVICE);

		if (ic->i_sends)
			rds_ib_send_clear_ring(ic);
		if (ic->i_recvs)
			rds_ib_recv_clear_ring(ic);

		/* QP first, then the CQs, MR and PD it was built on */
		if (ic->i_cm_id->qp)
			rdma_destroy_qp(ic->i_cm_id);
		if (ic->i_send_cq)
			ib_destroy_cq(ic->i_send_cq);
		if (ic->i_recv_cq)
			ib_destroy_cq(ic->i_recv_cq);
		if (ic->i_mr)
			ib_dereg_mr(ic->i_mr);
		if (ic->i_pd)
			ib_dealloc_pd(ic->i_pd);
		rdma_destroy_id(ic->i_cm_id);

		/* forget everything so a repeated call finds nothing to do */
		ic->i_cm_id = NULL;
		ic->i_pd = NULL;
		ic->i_mr = NULL;
		ic->i_send_cq = NULL;
		ic->i_recv_cq = NULL;
		ic->i_send_hdrs = NULL;
		ic->i_recv_hdrs = NULL;
		ic->i_ack = NULL;
		ic->i_map_count = 0;
	}

	/* drop the reference on a partially received incoming message */
	if (ic->i_ibinc) {
		rds_inc_put(&ic->i_ibinc->ii_inc);
		ic->i_ibinc = NULL;
	}

	kfree(ic->i_sends);
	ic->i_sends = NULL;
	kfree(ic->i_recvs);
	ic->i_recvs = NULL;

	ic->i_wc_err = 0;
}

/*
 * Allocate and initialize the IB transport data for @conn and store it in
 * conn->c_transport_data.  @gfp is the allocation mask to use; it is passed
 * through to the allocator so that callers which cannot use GFP_KERNEL are
 * honoured.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
	struct rds_ib_connection *ic;

	ic = kzalloc(sizeof(*ic), gfp);
	if (ic == NULL)
		return -ENOMEM;

	mutex_init(&ic->i_recv_mutex);
	spin_lock_init(&ic->i_ack_lock);

	/*
	 * rds_ib_conn_shutdown() waits for these to be emptied so they
	 * must be initialized before it can be called.
	 */
	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);

	conn->c_transport_data = ic;

	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
	return 0;
}

/*
 * Release the transport data that rds_ib_conn_alloc() allocated.  @arg is
 * the rds_ib_connection itself, not the rds_connection that pointed at it.
 */
void rds_ib_conn_free(void *arg)
{
	rdsdebug("ic %p\n", arg);
	kfree(arg);
}

/*
 * Create the listening cm_id, bind it to INADDR_ANY on RDS_PORT and start
 * listening with a backlog of 128.  On success the cm_id is published in
 * rds_ib_listen_id; on any failure it is destroyed again.
 *
 * Returns 0 on success or the error from the call that failed.
 */
int __init rds_ib_listen_init(void)
{
	struct sockaddr_in sin;
	struct rdma_cm_id *cm_id;
	int ret;

	cm_id = rdma_create_id(rds_ib_cm_event_handler, NULL, RDMA_PS_TCP);
	if (IS_ERR(cm_id)) {
		ret = PTR_ERR(cm_id);
		/* nothing to destroy; keep the IS_ERR pointer away from out: */
		cm_id = NULL;
		printk(KERN_ERR "RDS/ib: failed to setup listener, "
		       "rdma_create_id() returned %d\n", ret);
		goto out;
	}

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
	sin.sin_port = (__force u16)htons(RDS_PORT);

	/*
	 * XXX I bet this binds the cm_id to a device.  If we want to support
	 * fail-over we'll have to take this into consideration.
	 */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	if (ret) {
		printk(KERN_ERR "RDS/ib: failed to setup listener, "
		       "rdma_bind_addr() returned %d\n", ret);
		goto out;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		printk(KERN_ERR "RDS/ib: failed to setup listener, "
		       "rdma_listen() returned %d\n", ret);
		goto out;
	}

	rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);

	/* ownership moves to rds_ib_listen_id; don't destroy it below */
	rds_ib_listen_id = cm_id;
	cm_id = NULL;
out:
	if (cm_id)
		rdma_destroy_id(cm_id);
	return ret;
}

/*
 * Destroy the listening cm_id, if there is one, and forget about it.
 * Calling this when no listener exists does nothing.
 */
void rds_ib_listen_stop(void)
{
	if (rds_ib_listen_id == NULL)
		return;

	rdsdebug("cm %p\n", rds_ib_listen_id);
	rdma_destroy_id(rds_ib_listen_id);
	rds_ib_listen_id = NULL;
}
